Predicting Future Consumer Purchases

In this group research project, we are attempting to predict whether or not a customer reorders a given product based on their order history. For each order, we have a breakdown of all the products purchased, including the order in which each item was added to the cart, the time of day the order was placed, and the day of the week.

The data came with a predefined training set for fitting models and a separate test set for evaluation. However, the labels for those test points are only available by formally submitting predictions, which is not what we’re trying to do. So instead, we will redefine the original training data set (1.4m obs) to serve as our test points, and use the prior data set (30m obs) as our training data.

More information about the project & data can be found on github: Click here for more information

# Importing Data & Data Exploration

Initial Data Exploration

# Data exploration
# Peek at each raw table to understand the keys before joining:
#   order_prior / order_train: one row per (order_id, product_id)
#   orders: one row per order, keyed by order_id/user_id, with an eval_set flag
#   products / aisles / depts: lookup tables keyed by their *_id columns
head(order_prior, 12) # Details of all the purchased products in a given order for each order_id
##    order_id product_id add_to_cart_order reordered
## 1         2      33120                 1         1
## 2         2      28985                 2         1
## 3         2       9327                 3         0
## 4         2      45918                 4         1
## 5         2      30035                 5         0
## 6         2      17794                 6         1
## 7         2      40141                 7         1
## 8         2       1819                 8         1
## 9         2      43668                 9         0
## 10        3      33754                 1         1
## 11        3      24838                 2         1
## 12        3      17704                 3         1
head(order_train, 12) # Subset of training data in same format as order_prior
##    order_id product_id add_to_cart_order reordered
## 1         1      49302                 1         1
## 2         1      11109                 2         1
## 3         1      10246                 3         0
## 4         1      49683                 4         0
## 5         1      43633                 5         1
## 6         1      13176                 6         0
## 7         1      47209                 7         0
## 8         1      22035                 8         1
## 9        36      39612                 1         0
## 10       36      19660                 2         1
## 11       36      49235                 3         0
## 12       36      43086                 4         1
head(orders, 12) # History of all orders for user and the associated order_id
# (days_since_prior_order is NA on a user's first order -- see rows 1 and 12)
##    order_id user_id eval_set order_number order_dow order_hour_of_day
## 1   2539329       1    prior            1         2                 8
## 2   2398795       1    prior            2         3                 7
## 3    473747       1    prior            3         3                12
## 4   2254736       1    prior            4         4                 7
## 5    431534       1    prior            5         4                15
## 6   3367565       1    prior            6         2                 7
## 7    550135       1    prior            7         1                 9
## 8   3108588       1    prior            8         1                14
## 9   2295261       1    prior            9         1                16
## 10  2550362       1    prior           10         4                 8
## 11  1187899       1    train           11         4                 8
## 12  2168274       2    prior            1         2                11
##    days_since_prior_order
## 1                      NA
## 2                      15
## 3                      21
## 4                      29
## 5                      28
## 6                      19
## 7                      20
## 8                      14
## 9                       0
## 10                     30
## 11                     14
## 12                     NA
head(products) # Names of all the products, plus aisle & department foreign keys
##   product_id                                                      product_name
## 1          1                                        Chocolate Sandwich Cookies
## 2          2                                                  All-Seasons Salt
## 3          3                              Robust Golden Unsweetened Oolong Tea
## 4          4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce
## 5          5                                         Green Chile Anytime Sauce
## 6          6                                                      Dry Nose Oil
##   aisle_id department_id
## 1       61            19
## 2      104            13
## 3       94             7
## 4       38             1
## 5        5            13
## 6       11            11
head(aisles) # Names of all the aisles
##   aisle_id                      aisle
## 1        1      prepared soups salads
## 2        2          specialty cheeses
## 3        3        energy granola bars
## 4        4              instant foods
## 5        5 marinades meat preparation
## 6        6                      other
head(depts) # Names of all the departments
##   department_id    department
## 1             1        frozen
## 2             2         other
## 3             3        bakery
## 4             4       produce
## 5             5       alcohol
## 6             6 international

# Data Manipulation & Visualization

# Response variable: reordered (1/0)
# We want to predict whether a given product is reordered or not by a customer
### put info and distribution of response here, its relationship w/ other variables

# Distribution of orders by hour of day.
# FIX: removed par(mfrow = c(2, 2)) -- par() only configures base graphics
# and is silently ignored by ggplot2 (grid-based), so it had no effect here.
ggplot(orders, aes(x = order_hour_of_day, fill = as.factor(order_hour_of_day))) + 
  geom_histogram(bins = 24) +   # one bin per hour (order_hour_of_day is 0-23)
  labs(title = "Order Frequency by Hour",
       x = "Time of Day") +
  theme_Publication() + 
  theme(legend.position = "none")  # 24 fill levels; a legend adds no value

# See how day of week variable is coded (numeric)
class(orders$order_dow)
## [1] "integer"
# Recode order_dow (0-6) as a labelled factor for plotting.
# NOTE(review): the data dictionary does not state which day 0 maps to; the
# labels below assume 0 = Saturday -- confirm against the source documentation.
orders %>% 
  mutate(order_dow = factor(order_dow, labels = c("Saturday", "Sunday", "Monday",
                                                  "Tuesday", "Wednesday", "Thursday",
                                                  "Friday"))) %>% 
  ggplot(., aes(x = order_dow, fill = as.factor(order_dow))) + 
  geom_bar(width = 0.75) +      # FIX: dropped trailing empty argument
  labs(title = "Order Frequency by Day",
       x = "Day of the Week") + # FIX: removed stray leading space in label
  theme_Publication() +           # Distribution of order by day of the week
  scale_fill_Publication() + 
  theme(legend.position = "none")
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## Warning: The `scale_name` argument of `discrete_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Distribution of order frequency (days between a customer's consecutive orders)
ggplot(orders, aes(x = days_since_prior_order, fill = as.factor(days_since_prior_order))) + 
geom_histogram(bins = 30) +
labs(title = "Frequency of Customer Ordering",
     x = " Days since Prior Order") +
theme_Publication() +     
theme(legend.position = "none") # Most reorders happen within 1-8 days, with a spike every 7 days (weekly shoppers). The single largest spike is at 30 days -- the variable may be capped there, so 30 could mean "30 or more days"; worth confirming against the data dictionary.
## Warning: Removed 206209 rows containing non-finite outside the scale range
## (`stat_bin()`).

# How many NA values are in our "days_since_prior_order" column?
# (NA marks a user's first-ever order; 206,209 matches the number of users,
# so presumably one NA per user -- consistent with the warning above.)
table(is.na(orders$days_since_prior_order)) # 206209 NAs
## 
##   FALSE    TRUE 
## 3214874  206209
# Most ordered products? 
## Top-20 table of product order counts with product names attached
product_freq_table <- order_prior %>% 
  count(product_id) %>% # count frequency of each product
  arrange(desc(n)) %>% # arrange in descending order
  inner_join(., products, by = "product_id") %>% # merge data sets to bring in associated product names
  select(-c(aisle_id, department_id)) %>%  # keep only product_id, name, and frequency (n)
  slice(1:20) # access the 20 most frequent orders

## plot of most frequently ordered products
ggplot(product_freq_table, aes(x = reorder(product_name, -n), y = n, fill = as.factor(product_name))) + 
geom_bar(stat = "identity") +
labs(title = "Product Order Frequency",
     x = "Name of Product",
     y = "Total orders") +
theme_Publication() +  
# FIX: the mapped aesthetic is fill, so a fill scale is required --
# scale_colour_Publication() targeted the (unmapped) colour aesthetic and did nothing
scale_fill_Publication() +
theme(legend.position = "none",
      axis.text.x = element_text(angle = 45, hjust = 1))

## plot of most frequently ordered products, by aisle
product_freq_table %>% 
  left_join(products, by = "product_name") %>% # bring aisle/department ids back in
  select(n, department_id, aisle_id) %>% 
  left_join(depts, by = "department_id") %>% 
  left_join(aisles, by = "aisle_id") %>% 
  select(n, department, aisle) %>% 
  ggplot(., aes(x = reorder(aisle, -n), y = n, fill = as.factor(aisle))) + 
  geom_bar(stat = "identity") +
  labs(title = "Product Order Frequency",
       x = "Name of Aisle",
       y = "Total orders") +
  theme_Publication() +  
  scale_fill_Publication() +  # FIX: fill scale to match the fill aesthetic
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

Merging Data

First, I want to create a single data frame that combines all the order-specific information to the user-specific information, so that we can add more variables that are order-level and user-level into the same data frame, which will be useful for our classification models later.

# Create a full data set of all orders with all information on each order -- order_id, user_id, all products in that order, the associated aisle & department, etc.
# NOTE(review): full_join() first attaches the order_train product rows, but
# the following inner_join() with order_prior keeps only orders present in the
# prior set, so those train rows are dropped again -- every *.x column ends up
# NA and only the .y columns (from order_prior) are kept below. Confirm that
# discarding the train-set rows at this stage is intended.
full_order_data <- orders %>% 
  full_join(., order_train, by = "order_id") %>% 
  inner_join(., order_prior, by = "order_id") %>% 
  arrange(user_id, order_number) %>%  
  mutate(product_id = product_id.y,
         add_to_cart_order = add_to_cart_order.y,
         reordered = reordered.y,
         # recode day-of-week 0-6 as a labelled factor
         # (assumes 0 = Saturday -- TODO confirm against the data dictionary)
         order_dow = factor(order_dow, 
                            levels = c(0, 1, 2, 3, 4, 5, 6),
                            labels = c("Saturday", "Sunday", "Monday", 
                                       "Tuesday", "Wednesday", "Thursday", "Friday"))) %>% 
  select(-c(product_id.x, add_to_cart_order.x, reordered.x,
            product_id.y, add_to_cart_order.y, reordered.y)) %>% 
  inner_join(., products, by = "product_id") %>% 
  inner_join(., aisles, by = "aisle_id") %>% 
  inner_join(., depts, by = "department_id") %>% 
  select(-c(aisle_id, department_id, product_id))

head(full_order_data) # looks good
##   order_id user_id eval_set order_number order_dow order_hour_of_day
## 1  2539329       1    prior            1    Monday                 8
## 2  2539329       1    prior            1    Monday                 8
## 3  2539329       1    prior            1    Monday                 8
## 4  2539329       1    prior            1    Monday                 8
## 5  2539329       1    prior            1    Monday                 8
## 6  2398795       1    prior            2   Tuesday                 7
##   days_since_prior_order add_to_cart_order reordered
## 1                     NA                 1         0
## 2                     NA                 2         0
## 3                     NA                 3         0
## 4                     NA                 4         0
## 5                     NA                 5         0
## 6                     15                 1         1
##                              product_name           aisle department
## 1                                    Soda     soft drinks  beverages
## 2 Organic Unsweetened Vanilla Almond Milk soy lactosefree dairy eggs
## 3                     Original Beef Jerky   popcorn jerky     snacks
## 4              Aged White Cheddar Popcorn   popcorn jerky     snacks
## 5        XL Pick-A-Size Paper Towel Rolls     paper goods  household
## 6                                    Soda     soft drinks  beverages
# Remove extra objects from environment -- the raw tables are now folded
# into full_order_data
rm(order_prior, order_train, orders)

Variable Creation

## Now, we will be creating 9 new variables that will be used to explore further relationships in the data & added as features to the classification models.

New Variable 1:

Frequency each Customer Orders a Given Product

# Now I want to create new variables
## First -- frequency each customer orders a given product
## (number of order lines per user x product; summarise() peels off only the
## innermost group, so the result remains grouped by user_id -- see below)
product_count_per_user <- full_order_data %>% 
  group_by(user_id, product_name) %>% 
  summarize(product_count = n()) %>% 
  arrange(user_id, desc(product_count))
## `summarise()` has grouped output by 'user_id'. You can override using the
## `.groups` argument.
head(product_count_per_user)
## # A tibble: 6 × 3
## # Groups:   user_id [1]
##   user_id product_name          product_count
##     <int> <chr>                         <int>
## 1       1 Original Beef Jerky              10
## 2       1 Soda                             10
## 3       1 Pistachios                        9
## 4       1 Organic String Cheese             8
## 5       1 Cinnamon Toast Crunch             3
## 6       1 Zero Calorie Cola                 3

New Variable 2:

Average time of day & day of the week each customer places an order

## Average time of day & DOW customer places an order
avg_order_time <- full_order_data %>% 
  group_by(user_id) %>%
  summarize(avg_hour = mean(order_hour_of_day),
            avg_day = mean(as.numeric(order_dow)))  # factor levels 1..7 = Saturday..Friday

head(avg_order_time)
## # A tibble: 6 × 3
##   user_id avg_hour avg_day
##     <int>    <dbl>   <dbl>
## 1       1     10.5    3.64
## 2       2     10.4    3.01
## 3       3     16.4    2.01
## 4       4     13.1    5.72
## 5       5     15.7    2.62
## 6       6     17      4.86
# plot -- heatmap of each user's average order time
# FIX: avg_day comes from as.numeric() on the order_dow factor, whose levels
# run 1 = Saturday .. 7 = Friday, so the axis breaks must be 1:7 with the same
# day ordering. The previous breaks (0:6, labelled Sunday-first) were off by
# one AND in a different order than the factor coding.
ggplot(avg_order_time, aes(x = avg_hour, y = avg_day)) +
  geom_tile(aes(fill = after_stat(count)), stat = "bin2d") +
  labs(x = "Average Hour of Day", y = "Average Day of Week",
       title = "Average Order Time") +
  scale_y_continuous(breaks = 1:7, 
                     labels = c("Saturday", "Sunday", "Monday", "Tuesday",
                                "Wednesday", "Thursday", "Friday")) +
  theme_Publication() +
  scale_fill_gradient(low = "blue", high = "red")

New Variables 3 & 4:

Which products are ordered more frequently based on the day of the week?

Which products are ordered more frequently based on the time of the day?

# What products are ordered more frequently based on DOW?
# (product line counts per day-of-week; stays grouped by order_dow)
dow_freq_order <- full_order_data %>% 
  group_by(order_dow, product_name) %>%
  summarize(product_count_day = n()) %>% 
  arrange(order_dow, desc(product_count_day))
## `summarise()` has grouped output by 'order_dow'. You can override using the
## `.groups` argument.
head(dow_freq_order)
## # A tibble: 6 × 3
## # Groups:   order_dow [1]
##   order_dow product_name           product_count_day
##   <fct>     <chr>                              <int>
## 1 Saturday  Banana                             96769
## 2 Saturday  Bag of Organic Bananas             71493
## 3 Saturday  Organic Baby Spinach               54914
## 4 Saturday  Organic Strawberries               53831
## 5 Saturday  Organic Hass Avocado               43944
## 6 Saturday  Organic Avocado                    39846
# Top 3 products for each day of the week
# NOTE: top_n() keeps ties, so a facet may show more than 3 bars on a tie.
# NOTE(review): fill is a fixed "skyblue" (not a mapped aesthetic), so
# scale_fill_Publication() appears to have no effect here.
dow_freq_order %>%
  group_by(order_dow) %>%
  top_n(3, product_count_day) %>% 
  ggplot(., aes(x = reorder(product_name, -product_count_day), y = product_count_day)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(x = "Product Name", y = "Product Count",
       title = "Top 3 Products Ordered by Day of Week") +
  theme_clean() +
  scale_fill_Publication() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_wrap(~ order_dow, nrow = 2, scales = "free_x")

# Same plot widened to the top 6 products per day of the week
dow_freq_order %>%
  group_by(order_dow) %>%
  top_n(6, product_count_day) %>% 
  ggplot(., aes(x = reorder(product_name, -product_count_day), y = product_count_day)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(x = "Product Name", y = "Product Count",
       title = "Top 6 Products Ordered by Day of Week") +  # FIX: title said "Top 3" but top_n(6, ...) selects six
  theme_clean() +
  scale_fill_Publication() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_wrap(~ order_dow, nrow = 2, scales = "free_x")

# What products are ordered more frequently based on time of day?
# (product line counts per hour 0-23; stays grouped by order_hour_of_day)
hour_freq_order <- full_order_data %>%
  group_by(order_hour_of_day, product_name) %>%
  summarize(product_count_hour = n()) %>% 
  arrange(order_hour_of_day, desc(product_count_hour))
## `summarise()` has grouped output by 'order_hour_of_day'. You can override using
## the `.groups` argument.
head(hour_freq_order)
## # A tibble: 6 × 3
## # Groups:   order_hour_of_day [1]
##   order_hour_of_day product_name           product_count_hour
##               <int> <chr>                               <int>
## 1                 0 Banana                               2815
## 2                 0 Bag of Organic Bananas               2712
## 3                 0 Organic Strawberries                 1839
## 4                 0 Organic Baby Spinach                 1768
## 5                 0 Organic Hass Avocado                 1398
## 6                 0 Organic Avocado                      1176
# Top 3 products for each hour of the day
# NOTE: top_n() keeps ties, so a facet may show more than 3 bars on a tie.
hour_freq_order %>%
  group_by(order_hour_of_day) %>%
  top_n(3, product_count_hour) %>% 
ggplot(., aes(x = reorder(product_name, -product_count_hour), y = product_count_hour)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(x = "Product Name", y = "Product Count",
       title = "Top 3 Products Ordered by Hour of Day") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  facet_wrap(~ order_hour_of_day, nrow = 2, scales = "free_x")

New Variables 5 & 6:

Size of each individual order

Average order size per customer

# Calculate order size (number of product rows per order)
order_size <- full_order_data %>%
  group_by(order_id) %>%
  summarize(order_size = n())

head(order_size)
## # A tibble: 6 × 2
##   order_id order_size
##      <int>      <int>
## 1        2          9
## 2        3          8
## 3        4         13
## 4        5         26
## 5        6          3
## 6        7          2
#plot

# Aggregate order size at the customer level and calculate average order size
# NOTE(review): the join repeats each order's size once per product row, so
# this mean is weighted by order size (sum(size^2) / sum(size)) rather than a
# plain per-order average -- larger orders pull the average up. Confirm this
# weighted definition is intended (a plain mean would aggregate one row per
# order before averaging).
avg_order_size <- order_size %>%
  inner_join(full_order_data, by = "order_id") %>%
  group_by(user_id) %>%
  summarize(avg_order_size = mean(order_size))

head(avg_order_size)
## # A tibble: 6 × 2
##   user_id avg_order_size
##     <int>          <dbl>
## 1       1           6.25
## 2       2          16.1 
## 3       3           7.89
## 4       4           4.56
## 5       5          10.0 
## 6       6           5.29

New Variables 7 & 8:

Which products are reordered the most?

User-level product reordering ratio

# Which products are reordered the most?
# (reordered is 0/1, so sum() counts the reorder lines per product)
reorder_counts <- full_order_data %>%
  group_by(product_name) %>%
  summarize(total_reorders = sum(reordered)) %>% 
  arrange(desc(total_reorders))

head(reorder_counts)
## # A tibble: 6 × 2
##   product_name           total_reorders
##   <chr>                           <int>
## 1 Banana                         398609
## 2 Bag of Organic Bananas         315913
## 3 Organic Strawberries           205845
## 4 Organic Baby Spinach           186884
## 5 Organic Hass Avocado           170131
## 6 Organic Avocado                134044
# plot of the 20 most-reordered products
reorder_counts %>% 
  slice(1:20) %>% 
  ggplot(., aes(x = reorder(product_name, -total_reorders), y = total_reorders, 
                fill = as.factor(product_name))) + 
  geom_bar(stat = "identity") +
  labs(title = "Product Reorder Frequency",
     x = "Name of Product",
     y = "Total Reorders") +
  theme_Publication() +  
  # NOTE(review): the mapped aesthetic is fill, so this colour scale appears
  # to have no effect -- scale_fill_Publication() is likely what was meant
  scale_colour_Publication() +
  theme(legend.position = "none",
      axis.text.x = element_text(angle = 45, hjust = 1))

# Product reorder ratio, per user: share of a user's purchases of a product
# that were reorders (reorders / total purchases of that product)
product_reorder_ratio <- full_order_data %>%
  group_by(user_id, product_name) %>%
  summarize(product_reorder_ratio = sum(reordered) / n()) %>%
  arrange(user_id, desc(product_reorder_ratio))
## `summarise()` has grouped output by 'user_id'. You can override using the
## `.groups` argument.
product_reorder_ratio
## # A tibble: 13,307,953 × 3
## # Groups:   user_id [206,209]
##    user_id product_name                     product_reorder_ratio
##      <int> <chr>                                            <dbl>
##  1       1 Original Beef Jerky                              0.9  
##  2       1 Soda                                             0.9  
##  3       1 Pistachios                                       0.889
##  4       1 Organic String Cheese                            0.875
##  5       1 Cinnamon Toast Crunch                            0.667
##  6       1 Zero Calorie Cola                                0.667
##  7       1 Aged White Cheddar Popcorn                       0.5  
##  8       1 Bag of Organic Bananas                           0.5  
##  9       1 Organic Half & Half                              0.5  
## 10       1 XL Pick-A-Size Paper Towel Rolls                 0.5  
## # ℹ 13,307,943 more rows
# Distributions of all the variables. Try to replicate that cool graph of which products are linked

# Bonus: Distribution of 'reordered' outcome variable
full_order_data %>% 
  mutate(reordered = factor(reordered,
                            levels = c(0,1),
                            labels = c("No", "Yes"))) %>% 
  ggplot(., aes(reordered, fill = reordered)) +
  geom_bar() + labs(x = "Reordered Product?",
       title = "Distribution of Reordering Products") +
  # NOTE(review): the factor labels above are already "No"/"Yes", so this
  # scale_x_discrete() relabelling is redundant (harmless)
  scale_x_discrete(labels = c("No", "Yes")) +
  theme_Publication() +
  scale_colour_Publication() +
  theme(legend.position = "none" )

New Variable 9:

Reorder rate by item position in cart

# What is the relationship between the order in which an item is added to the cart & reordering it?
# (mean of the 0/1 reordered flag = reorder rate, per cart position)
reorder_rate_by_position <- full_order_data %>%
  group_by(add_to_cart_order) %>%
  summarize(reorder_rate = mean(reordered)) 

# plot
ggplot(reorder_rate_by_position, aes(x = add_to_cart_order, y = reorder_rate)) +
  geom_line() +
  geom_point() +
  labs(x = "Cart Position", y = "Reorder Rate", title = "Reorder Rate by Cart Position") +
  theme_Publication() +
  scale_fill_Publication()

# Logarithmic relationship until cart has ~50 items -- curious to see if these are just some outlier cases

# All different order sizes -- ranges from 1:145 items in a single order
range(full_order_data$add_to_cart_order)
## [1]   1 145
# Filter observations for orders of over 50 items
full_order_data %>% 
  filter(add_to_cart_order > 50) %>% 
  group_by(order_id)
## # A tibble: 26,359 × 12
## # Groups:   order_id [3,081]
##    order_id user_id eval_set order_number order_dow order_hour_of_day
##       <int>   <int> <chr>           <int> <fct>                 <int>
##  1  3306184     133 prior              17 Wednesday                 3
##  2  3306184     133 prior              17 Wednesday                 3
##  3  3306184     133 prior              17 Wednesday                 3
##  4  3306184     133 prior              17 Wednesday                 3
##  5  3306184     133 prior              17 Wednesday                 3
##  6  3306184     133 prior              17 Wednesday                 3
##  7  3306184     133 prior              17 Wednesday                 3
##  8  3306184     133 prior              17 Wednesday                 3
##  9  1730857     153 prior               9 Sunday                   14
## 10  1730857     153 prior               9 Sunday                   14
## # ℹ 26,349 more rows
## # ℹ 6 more variables: days_since_prior_order <dbl>, add_to_cart_order <int>,
## #   reordered <int>, product_name <chr>, aisle <chr>, department <chr>
# Only 26,359 observations for 3,081 orders.. 
3081/32434489
## [1] 9.499148e-05
# NOTE(review): the denominator here is the row count of full_order_data
# (order-product lines), not the count of distinct orders, so this isn't
# literally a percent of orders -- either way these very long orders are rare,
# which justifies restricting attention to positions at or below 50.

# Redefine variable to filter only orders with 50 or less items in cart
reorder_rate_by_position <- reorder_rate_by_position %>% 
  filter(add_to_cart_order <= 50) 

# New plot -- looks much better. We now see a clear relationship b/w cart position & reordering
ggplot(reorder_rate_by_position, aes(x = add_to_cart_order, y = reorder_rate)) +
  geom_line() +
  geom_point() +
  labs(x = "Cart Position", y = "Reorder Rate", title = "Reorder Rate by Cart Position") +
  theme_Publication() +
  scale_fill_Publication()

head(reorder_rate_by_position)
## # A tibble: 6 × 2
##   add_to_cart_order reorder_rate
##               <int>        <dbl>
## 1                 1        0.678
## 2                 2        0.676
## 3                 3        0.658
## 4                 4        0.637
## 5                 5        0.617
## 6                 6        0.600

Merge New Variables back into Full DF

# How it looks before merging
head(full_order_data)
##   order_id user_id eval_set order_number order_dow order_hour_of_day
## 1  2539329       1    prior            1    Monday                 8
## 2  2539329       1    prior            1    Monday                 8
## 3  2539329       1    prior            1    Monday                 8
## 4  2539329       1    prior            1    Monday                 8
## 5  2539329       1    prior            1    Monday                 8
## 6  2398795       1    prior            2   Tuesday                 7
##   days_since_prior_order add_to_cart_order reordered
## 1                     NA                 1         0
## 2                     NA                 2         0
## 3                     NA                 3         0
## 4                     NA                 4         0
## 5                     NA                 5         0
## 6                     15                 1         1
##                              product_name           aisle department
## 1                                    Soda     soft drinks  beverages
## 2 Organic Unsweetened Vanilla Almond Milk soy lactosefree dairy eggs
## 3                     Original Beef Jerky   popcorn jerky     snacks
## 4              Aged White Cheddar Popcorn   popcorn jerky     snacks
## 5        XL Pick-A-Size Paper Towel Rolls     paper goods  household
## 6                                    Soda     soft drinks  beverages
# Attach all nine derived variables back onto the row-level data.
# NOTE(review): reorder_rate_by_position was filtered to add_to_cart_order <= 50,
# so rows with a cart position above 50 get NA reorder_rate from this last join
# (visible later as 26,359 NA's in summary()) -- handle before modeling.
full_order_data <- full_order_data %>% 
  left_join(., product_count_per_user, by = c("user_id", "product_name")) %>% 
  left_join(., avg_order_time, by = "user_id") %>% 
  left_join(., dow_freq_order, by = c("order_dow", "product_name")) %>% 
  left_join(., hour_freq_order, by = c("order_hour_of_day", "product_name")) %>% 
  left_join(., order_size, by = "order_id") %>% 
  left_join(., avg_order_size, by = "user_id") %>% 
  left_join(., reorder_counts, by = "product_name") %>% 
  left_join(., product_reorder_ratio, by = c("user_id", "product_name")) %>% 
  left_join(., reorder_rate_by_position, by = "add_to_cart_order")

# New final data set -- looks good
head(full_order_data)
##   order_id user_id eval_set order_number order_dow order_hour_of_day
## 1  2539329       1    prior            1    Monday                 8
## 2  2539329       1    prior            1    Monday                 8
## 3  2539329       1    prior            1    Monday                 8
## 4  2539329       1    prior            1    Monday                 8
## 5  2539329       1    prior            1    Monday                 8
## 6  2398795       1    prior            2   Tuesday                 7
##   days_since_prior_order add_to_cart_order reordered
## 1                     NA                 1         0
## 2                     NA                 2         0
## 3                     NA                 3         0
## 4                     NA                 4         0
## 5                     NA                 5         0
## 6                     15                 1         1
##                              product_name           aisle department
## 1                                    Soda     soft drinks  beverages
## 2 Organic Unsweetened Vanilla Almond Milk soy lactosefree dairy eggs
## 3                     Original Beef Jerky   popcorn jerky     snacks
## 4              Aged White Cheddar Popcorn   popcorn jerky     snacks
## 5        XL Pick-A-Size Paper Towel Rolls     paper goods  household
## 6                                    Soda     soft drinks  beverages
##   product_count avg_hour  avg_day product_count_day product_count_hour
## 1            10 10.54237 3.644068              5953               2135
## 2             1 10.54237 3.644068              2019                885
## 3            10 10.54237 3.644068              1159                423
## 4             2 10.54237 3.644068               313                125
## 5             2 10.54237 3.644068               177                 81
## 6            10 10.54237 3.644068              5499                687
##   order_size avg_order_size total_reorders product_reorder_ratio reorder_rate
## 1          5       6.254237          27791                   0.9    0.6775329
## 2          5       6.254237          12923                   0.0    0.6762507
## 3          5       6.254237           4797                   0.9    0.6580367
## 4          5       6.254237           1360                   0.5    0.6369578
## 5          5       6.254237            536                   0.5    0.6173831
## 6          6       6.254237          27791                   0.9    0.6775329
# Remove extra objects from environment -- the intermediate feature tables are
# all merged into full_order_data now
rm(hour_freq_order, order_size, product_count_per_user, product_reorder_ratio,
   reorder_counts, reorder_rate_by_position, avg_order_size, avg_order_time,
   dow_freq_order, products, aisles, depts)

# Summary statistics for all variables in final data set
# (note the 26,359 NA's in reorder_rate -- rows with cart position > 50,
# a consequence of the earlier filter; handle before modeling)
summary(full_order_data)
##     order_id          user_id         eval_set          order_number  
##  Min.   :      2   Min.   :     1   Length:32434489    Min.   : 1.00  
##  1st Qu.: 855943   1st Qu.: 51421   Class :character   1st Qu.: 5.00  
##  Median :1711048   Median :102611   Mode  :character   Median :11.00  
##  Mean   :1710749   Mean   :102937                      Mean   :17.14  
##  3rd Qu.:2565514   3rd Qu.:154391                      3rd Qu.:24.00  
##  Max.   :3421083   Max.   :206209                      Max.   :99.00  
##                                                                       
##      order_dow       order_hour_of_day days_since_prior_order add_to_cart_order
##  Saturday :6209666   Min.   : 0.00     Min.   : 0.0           Min.   :  1.000  
##  Sunday   :5665856   1st Qu.:10.00     1st Qu.: 5.0           1st Qu.:  3.000  
##  Monday   :4217798   Median :13.00     Median : 8.0           Median :  6.000  
##  Tuesday  :3844117   Mean   :13.42     Mean   :11.1           Mean   :  8.351  
##  Wednesday:3787215   3rd Qu.:16.00     3rd Qu.:15.0           3rd Qu.: 11.000  
##  Thursday :4209533   Max.   :23.00     Max.   :30.0           Max.   :145.000  
##  Friday   :4500304                     NA's   :2078068                         
##    reordered      product_name          aisle            department       
##  Min.   :0.0000   Length:32434489    Length:32434489    Length:32434489   
##  1st Qu.:0.0000   Class :character   Class :character   Class :character  
##  Median :1.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.5897                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##                                                                           
##  product_count       avg_hour        avg_day      product_count_day
##  Min.   : 1.000   Min.   : 0.00   Min.   :1.000   Min.   :    1    
##  1st Qu.: 2.000   1st Qu.:12.20   1st Qu.:3.244   1st Qu.:  237    
##  Median : 4.000   Median :13.41   Median :3.754   Median :  981    
##  Mean   : 7.621   Mean   :13.42   Mean   :3.739   Mean   : 5411    
##  3rd Qu.:10.000   3rd Qu.:14.62   3rd Qu.:4.226   3rd Qu.: 4206    
##  Max.   :99.000   Max.   :23.00   Max.   :7.000   Max.   :96769    
##                                                                    
##  product_count_hour   order_size    avg_order_size  total_reorders  
##  Min.   :    1      Min.   :  1.0   Min.   : 1.00   Min.   :     0  
##  1st Qu.:   96      1st Qu.:  9.0   1st Qu.:10.50   1st Qu.:   832  
##  Median :  414      Median : 14.0   Median :14.55   Median :  3873  
##  Mean   : 2399      Mean   : 15.7   Mean   :15.70   Mean   : 26593  
##  3rd Qu.: 1773      3rd Qu.: 21.0   3rd Qu.:19.63   3rd Qu.: 18541  
##  Max.   :40731      Max.   :145.0   Max.   :78.55   Max.   :398609  
##                                                                     
##  product_reorder_ratio  reorder_rate  
##  Min.   :0.0000        Min.   :0.397  
##  1st Qu.:0.5000        1st Qu.:0.541  
##  Median :0.7500        Median :0.600  
##  Mean   :0.5897        Mean   :0.590  
##  3rd Qu.:0.9000        3rd Qu.:0.658  
##  Max.   :0.9899        Max.   :0.678  
##                        NA's   :26359

Correlation Matrix

# Report the storage class of every column in the joined data set
sapply(full_order_data, class)
##               order_id                user_id               eval_set 
##              "integer"              "integer"            "character" 
##           order_number              order_dow      order_hour_of_day 
##              "integer"               "factor"              "integer" 
## days_since_prior_order      add_to_cart_order              reordered 
##              "numeric"              "integer"              "integer" 
##           product_name                  aisle             department 
##            "character"            "character"            "character" 
##          product_count               avg_hour                avg_day 
##              "integer"              "numeric"              "numeric" 
##      product_count_day     product_count_hour             order_size 
##              "integer"              "integer"              "integer" 
##         avg_order_size         total_reorders  product_reorder_ratio 
##              "numeric"              "integer"              "numeric" 
##           reorder_rate 
##              "numeric"
# Correlation Matrix of Variables.
# The day-of-week factor is recoded to its integer level codes so it can be
# included. ID / character / factor columns are dropped by NAME rather than by
# positional index -- the original -c(1, 2, 3, 5, 10, 11, 12) silently breaks
# if columns are ever added or reordered. dplyr::select is fully qualified
# because MASS (loaded later in this document) masks select().
corr_matrix <- full_order_data %>%
  mutate(order_dow_numeric = as.numeric(order_dow)) %>%
  dplyr::select(-order_id, -user_id, -eval_set, -order_dow,
                -product_name, -aisle, -department) %>%
  {cor(., use = "pairwise.complete.obs")}

# Visualize Corr Matrix (lower triangle only)
corrplot(corr_matrix, method = "shade", type = "lower",
         title = "Instacart Correlation Matrix",
         tl.cex = 0.75, tl.col = "black", tl.srt = 45,
         number.cex = 0.5,
         mar = c(0, 0, 1, 0))

Mostly uncorrelated variables – some that are correlated tend to be because they were created by that variable (reordered / product_reorder_ratio). We do find that reorder_rate & add_to_cart_order have a negative relationship, alongside reorder_rate & order_size. Total_reorders has a positive correlation w/ product_count_day & product_count_hour.

Model Building

For our models – we have a huge data set (32m observations). I tried running it with the full dataset initially, and it takes far too long as it is computationally very expensive on my MacBook. Instead, I decided to first subset a random sample of the data so it moves down from 32m obs to 8m obs, 1/4 of the full data set. I understand that by doing this we lose a lot of our data, but the sample is still large enough that I don’t expect this to be much of a problem – our data is still very low-dimensional.

After subsetting the data, I then employed mini-batch training, which involves taking subsets of our shrunken data set, called “batches”, and applying regression models to each subset. We then iterate through the batches, updating our model at each iteration. We run this algorithm until we hit the total number of batches, which is defined by the size of our training data set divided by the batch size (an arbitrary number we set – not too high or else the model trains very slowly).

# Data set is huge -- split it up randomly and use mini-batch training.
# First -- draw a random subset 1/4 of the original size (32m -> 8m rows)
set.seed(444)
full_order_data <- full_order_data[sample(nrow(full_order_data), 8000000), ]

# Second -- split into training & test sets (80/20).
# seq_len() is the safe index idiom (1:nrow(x) yields c(1, 0) on empty data).
train_index <- sample(seq_len(nrow(full_order_data)), 0.8 * nrow(full_order_data))
train_data <- full_order_data[train_index, ]
test_data <- full_order_data[-train_index, ]

# Mini-batch bookkeeping: number of batches needed to cover the training set
batch_size <- 50000
num_batches <- ceiling(nrow(train_data) / batch_size)

# Replace NA's with 0s -- they correspond to "days since previous order" for a
# customer's first order (and to reorder_rate for some products), so 0 is a
# sensible fill value.
# NOTE(review): the original mutate_all(~ifelse(is.na(.), 0, .)) silently
# coerced the order_dow factor to its integer level codes (ifelse() strips
# factor attributes). Downstream code (kNN feature matrix, xgboost matrix
# conversion) relies on order_dow being numeric, so that coercion is kept --
# but made explicit here instead of happening by accident. Only numeric
# columns are filled; the character columns contain no NA's.
train_data <- train_data %>%
  mutate(order_dow = as.numeric(order_dow)) %>%
  mutate(across(where(is.numeric), ~ replace(., is.na(.), 0)))
# Sanity check -- no NA's remain
sum(is.na(train_data)) # 0
## [1] 0
# Same NA handling on the test data (order_dow again coerced explicitly to its
# numeric level codes, matching the treatment of the training split)
test_data <- test_data %>%
  mutate(order_dow = as.numeric(order_dow)) %>%
  mutate(across(where(is.numeric), ~ replace(., is.na(.), 0)))
# Again, we check -- no NA's remain
sum(is.na(test_data)) # 0
## [1] 0
# Summary statistics for all variables in the final 8m-row data subset
# (taken on full_order_data BEFORE the NA fill above, so the NA counts for
# days_since_prior_order and reorder_rate still appear in the output)
summary(full_order_data)
##     order_id          user_id         eval_set          order_number  
##  Min.   :      2   Min.   :     1   Length:8000000     Min.   : 1.00  
##  1st Qu.: 855816   1st Qu.: 51344   Class :character   1st Qu.: 5.00  
##  Median :1711068   Median :102562   Mode  :character   Median :11.00  
##  Mean   :1710795   Mean   :102907                      Mean   :17.13  
##  3rd Qu.:2565888   3rd Qu.:154381                      3rd Qu.:24.00  
##  Max.   :3421083   Max.   :206209                      Max.   :99.00  
##                                                                       
##      order_dow       order_hour_of_day days_since_prior_order add_to_cart_order
##  Saturday :1531940   Min.   : 0.00     Min.   : 0.0           Min.   :  1.000  
##  Sunday   :1399493   1st Qu.:10.00     1st Qu.: 5.0           1st Qu.:  3.000  
##  Monday   :1039624   Median :13.00     Median : 8.0           Median :  6.000  
##  Tuesday  : 947785   Mean   :13.42     Mean   :11.1           Mean   :  8.353  
##  Wednesday: 933892   3rd Qu.:16.00     3rd Qu.:15.0           3rd Qu.: 11.000  
##  Thursday :1037663   Max.   :23.00     Max.   :30.0           Max.   :143.000  
##  Friday   :1109603                     NA's   :512792                          
##    reordered      product_name          aisle            department       
##  Min.   :0.0000   Length:8000000     Length:8000000     Length:8000000    
##  1st Qu.:0.0000   Class :character   Class :character   Class :character  
##  Median :1.0000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.5895                                                           
##  3rd Qu.:1.0000                                                           
##  Max.   :1.0000                                                           
##                                                                           
##  product_count       avg_hour        avg_day      product_count_day
##  Min.   : 1.000   Min.   : 0.00   Min.   :1.000   Min.   :    1    
##  1st Qu.: 2.000   1st Qu.:12.20   1st Qu.:3.244   1st Qu.:  237    
##  Median : 4.000   Median :13.41   Median :3.754   Median :  982    
##  Mean   : 7.624   Mean   :13.42   Mean   :3.739   Mean   : 5423    
##  3rd Qu.:10.000   3rd Qu.:14.62   3rd Qu.:4.226   3rd Qu.: 4216    
##  Max.   :99.000   Max.   :23.00   Max.   :7.000   Max.   :96769    
##                                                                    
##  product_count_hour   order_size     avg_order_size  total_reorders  
##  Min.   :    1      Min.   :  1.00   Min.   : 1.00   Min.   :     0  
##  1st Qu.:   96      1st Qu.:  9.00   1st Qu.:10.50   1st Qu.:   832  
##  Median :  414      Median : 14.00   Median :14.56   Median :  3876  
##  Mean   : 2403      Mean   : 15.71   Mean   :15.70   Mean   : 26651  
##  3rd Qu.: 1774      3rd Qu.: 21.00   3rd Qu.:19.63   3rd Qu.: 18541  
##  Max.   :40731      Max.   :145.00   Max.   :78.55   Max.   :398609  
##                                                                      
##  product_reorder_ratio  reorder_rate  
##  Min.   :0.0000        Min.   :0.397  
##  1st Qu.:0.5000        1st Qu.:0.541  
##  Median :0.7500        Median :0.600  
##  Mean   :0.5897        Mean   :0.590  
##  3rd Qu.:0.9000        3rd Qu.:0.658  
##  Max.   :0.9899        Max.   :0.678  
##                        NA's   :6512
# Mean & SD for every numeric column.
# vapply() over a single logical column mask replaces the original grow-by-c()
# loop (O(n^2) appends) and the redundant is.numeric || is.integer check
# (is.numeric() is already TRUE for integer columns). Crucially, the variable
# names are derived from the SAME mask -- the original hard-coded
# colnames(full_order_data)[-c(3, 5, 10:12)], which silently mislabels rows if
# columns are ever added or reordered.
num_cols <- vapply(full_order_data, is.numeric, logical(1))
mean_values <- unname(vapply(full_order_data[num_cols], mean, numeric(1), na.rm = TRUE))
sd_values <- unname(vapply(full_order_data[num_cols], sd, numeric(1), na.rm = TRUE))

mean_sd_df <- data.frame("Variables" = colnames(full_order_data)[num_cols],
                         "Mean" = mean_values,
                         "SD" = sd_values)

mean_sd_df
##                 Variables         Mean           SD
## 1                order_id 1.710795e+06 9.873501e+05
## 2                 user_id 1.029068e+05 5.948021e+04
## 3            order_number 1.713174e+01 1.752758e+01
## 4       order_hour_of_day 1.342473e+01 4.247423e+00
## 5  days_since_prior_order 1.110160e+01 8.775010e+00
## 6       add_to_cart_order 8.352539e+00 7.128817e+00
## 7               reordered 5.895110e-01 4.919226e-01
## 8           product_count 7.623776e+00 9.816934e+00
## 9                avg_hour 1.342474e+01 1.876602e+00
## 10                avg_day 3.738628e+00 8.436293e-01
## 11      product_count_day 5.423167e+03 1.243822e+04
## 12     product_count_hour 2.402714e+03 5.641647e+03
## 13             order_size 1.570720e+01 9.537659e+00
## 14         avg_order_size 1.570366e+01 7.339443e+00
## 15         total_reorders 2.665121e+04 6.543087e+04
## 16  product_reorder_ratio 5.896708e-01 3.647764e-01
## 17           reorder_rate 5.898350e-01 6.937618e-02

Logit

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(class)
### Logit ###
#############

# "Mini-batch" training loop.
# NOTE(review): glm() has no warm start here -- each iteration fits a fresh
# model and overwrites insta_logit, so after the loop only the FINAL batch
# (up to 50k rows) has determined the model; all earlier batches are
# discarded. For true incremental fitting consider biglm::bigglm(), or pass
# start = coef(insta_logit) into subsequent glm() calls.
for (i in 1:num_batches) {
  # Calculate start and end indices for the current batch
  start_index <- ((i - 1) * batch_size) + 1
  end_index <- min(i * batch_size, nrow(train_data))
  
  # Read the current batch of training data
  batch_data <- train_data[start_index:end_index, ]

  insta_logit <-  glm(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
        add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + product_count_hour +
        order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate, 
      data = batch_data, family = "binomial")
}

# Summary of the fitted logit (i.e., the model from the LAST batch only).
# Per the printed output: order_number, days_since_prior_order,
# add_to_cart_order, order_size, and product_reorder_ratio are significant at
# all standard levels; product_count and avg_order_size at the 1% level.
# NOTE(review): order_dow enters as a single numeric term (its factor coding
# was lost during NA replacement), so no per-day contrasts (e.g. Tuesday vs
# Saturday) appear, and order_dow itself is not significant (p = 0.94).
summary(insta_logit)
## 
## Call:
## glm(formula = reordered ~ order_number + order_dow + order_hour_of_day + 
##     days_since_prior_order + add_to_cart_order + avg_hour + avg_day + 
##     product_count + product_count_day + product_count_hour + 
##     order_size + avg_order_size + total_reorders + product_reorder_ratio + 
##     reorder_rate, family = "binomial", data = batch_data)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -5.117e+00  4.524e-01 -11.311  < 2e-16 ***
## order_number            5.868e-02  1.196e-03  49.076  < 2e-16 ***
## order_dow              -5.960e-04  7.853e-03  -0.076  0.93950    
## order_hour_of_day       2.097e-03  3.806e-03   0.551  0.58165    
## days_since_prior_order  5.977e-02  1.740e-03  34.356  < 2e-16 ***
## add_to_cart_order      -5.132e-02  6.854e-03  -7.487 7.03e-14 ***
## avg_hour                4.391e-03  8.571e-03   0.512  0.60844    
## avg_day                 6.749e-03  1.845e-02   0.366  0.71452    
## product_count          -7.485e-03  2.734e-03  -2.737  0.00620 ** 
## product_count_day       7.005e-08  4.876e-06   0.014  0.98854    
## product_count_hour      1.173e-05  7.874e-06   1.490  0.13626    
## order_size              1.311e-02  2.853e-03   4.594 4.34e-06 ***
## avg_order_size          8.929e-03  3.147e-03   2.838  0.00454 ** 
## total_reorders         -7.482e-07  1.064e-06  -0.703  0.48195    
## product_reorder_ratio   7.359e+00  8.529e-02  86.279  < 2e-16 ***
## reorder_rate           -8.158e-01  6.487e-01  -1.258  0.20855    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 67613  on 49999  degrees of freedom
## Residual deviance: 31101  on 49984  degrees of freedom
## AIC: 31133
## 
## Number of Fisher Scoring iterations: 6
## Misclassification error rate
# Predicted reorder probabilities on the held-out test split
logit_preds <- predict(insta_logit, newdata = test_data, type = "response")
# Hard class labels at a 0.5 probability threshold (<- and spacing per style)
logit_threshold <- ifelse(logit_preds > 0.5, "1", "0")
# Confusion matrix: predicted (rows) vs actual (columns)
table(logit_threshold, test_data$reordered)
##                
## logit_threshold      0      1
##               0 502815  77995
##               1 153862 865328
# Overall accuracy (sum of diagonal / total)
mean(logit_threshold==test_data$reordered, na.rm = T) # logit model correctly predicts ~85.51% of the time (see printed value)
## [1] 0.8550894
# I.e. misclassification (test) error rate = ~14.49%
logit_test_error <- 1-mean(logit_threshold==test_data$reordered, na.rm = T)

# ROC Curve & AUC Value -- built from the continuous predicted probabilities
# (logit_preds), which is the appropriate input for a full ROC curve
roc_curve_logit <- roc(test_data$reordered, logit_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_logit, main = "ROC Curve", col = "blue")

auc_logit <- auc(roc_curve_logit)
print(paste("AUC:", auc_logit))
## [1] "AUC: 0.933023138690563"

LDA & QDA

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
### LDA & QDA ###
#################
# NOTE(review): as with the logit loop, lda()/qda() are refit from scratch on
# every batch and overwritten, so only the final batch determines
# insta_LDA / insta_QDA; earlier batches have no effect on the final models.
set.seed(101)
for (i in 1:num_batches) {
  # Calculate start and end indices for the current batch
  start_index <- ((i - 1) * batch_size) + 1
  end_index <- min(i * batch_size, nrow(train_data))
  
  # Read the current batch of training data
  batch_data <- train_data[start_index:end_index, ]
  
  insta_LDA <- lda(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order + 
                     add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + product_count_hour +
                     order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate,
                   data = batch_data)
  
  insta_QDA <- qda(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
                   add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + product_count_hour +
                   order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate,
                 data = batch_data)
  }

# Predictions on the held-out test split; each result is a list with $class
# (hard labels) and $posterior (class probabilities).
# NOTE(review): predict.lda()/predict.qda() take no `type` argument -- the
# original type = "response" was absorbed by `...` and silently ignored, so it
# is dropped here. Also `=` assignment replaced with `<-` per R style.
lda.pred <- predict(insta_LDA, newdata = test_data)
qda.pred <- predict(insta_QDA, newdata = test_data)

# Plot of LDA model (discriminant-score histograms by class)
plot(insta_LDA)

# Need to access column 'class' for LDA/QDA to create confusion matrix and calculate test error
table(lda.pred$class, test_data$reordered)
##    
##          0      1
##   0 465639  50118
##   1 191038 893205
mean(lda.pred$class==test_data$reordered, na.rm = T) # LDA correctly predicts ~84.93% of the time - misclassification error rate of ~15.07%
## [1] 0.8492775
lda_test_error <- 1-mean(lda.pred$class==test_data$reordered, na.rm = T)

table(qda.pred$class, test_data$reordered) # QDA correctly predicts ~76.16% of the time - misclas error rate = ~23.84% (per the printed mean below)
##    
##          0      1
##   0 594821 319621
##   1  61856 623702
mean(qda.pred$class==test_data$reordered, na.rm = T)
## [1] 0.7615769
qda_test_error <- 1-mean(qda.pred$class==test_data$reordered, na.rm = T)

# AUC (LDA)
# NOTE(review): the ROC here is built from hard class labels (as.numeric of
# $class), not posterior probabilities, which collapses the curve to a single
# operating point and understates AUC. Consider
# roc(test_data$reordered, lda.pred$posterior[, 2]) instead.
roc_curve_lda <- roc(test_data$reordered, as.numeric(lda.pred$class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_lda, main = "ROC Curve", col = "blue")

auc_lda <- auc(roc_curve_lda)
print(paste("AUC:", auc_lda))
## [1] "AUC: 0.827977280899065"
# AUC (QDA) -- same label-based-ROC caveat as for LDA above
roc_curve_qda <- roc(test_data$reordered, as.numeric(qda.pred$class))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_qda, main = "ROC Curve", col = "blue")

auc_qda <- auc(roc_curve_qda)
print(paste("AUC:", auc_qda))

kNN

Since kNN is a “lazy” learning method, a large data set takes far too long to make predictions on. I tried this model initially on 10m observations, and it would have taken over a day to run. So, we needed a much smaller subset of data – we created a new subset of our 8m-row training data, containing only 300,000 observations. This is a decrease in size by a factor of roughly 26! Thus, our results should be taken with a grain of salt, since the subset might not be fully representative of our data, but it should still give us some meaningful insight into our results and a sensible misclassification error rate.

library(class)
set.seed(18)
# kNN
# Only the predictors that were significant in the earlier models, to reduce
# the feature space
predictors <- c("order_number", "order_dow", "days_since_prior_order", 
    "add_to_cart_order", "avg_hour", "order_size", "avg_order_size", "product_reorder_ratio")

# kNN cost scales with the reference-set size, so work on a much smaller
# 300k-row subset of the training data
knn_subset_data <- train_data[sample(nrow(train_data), 300000), ]

# 80/20 train/test split within the subset; seq_len() is the safe index idiom
knn_train_index <- sample(seq_len(nrow(knn_subset_data)), 0.8 * nrow(knn_subset_data))
knn_train_data <- knn_subset_data[knn_train_index, ]
knn_test_data <- knn_subset_data[-knn_train_index, ]

# Experimented with different batch sizes
knn_batch_size <- 2500
knn_num_batches <- ceiling(nrow(knn_train_data) / knn_batch_size)
  
## Find optimal K value for kNN using 5-fold CV
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:httr':
## 
##     progress
# Define range of k values to tune over (odd values avoid voting ties in
# binary classification)
k_values <- c(5, 7, 9, 11)  # Example range of odds 5-11.

# Define the number of folds for cross-validation
num_folds <- 5  # 5-fold cross-validation

# Define the control parameters for cross-validation
ctrl <- trainControl(method = "cv",  # Use cross-validation
                     number = num_folds)  # Number of folds

# Tune k via caret: fits kNN for each k in k_values, selecting the k with the
# best cross-validated accuracy
knn_train <- train(as.factor(reordered) ~ order_number + order_dow + days_since_prior_order +
                     add_to_cart_order + avg_hour + order_size + avg_order_size +
                     product_reorder_ratio,  
                   data = knn_train_data,  # Training data
                   method = "knn",  # kNN method
                   trControl = ctrl,  # Cross-validation control
                   tuneGrid = data.frame(k = k_values))  # Grid of k values

# Print the results
print(knn_train) # best results from k = 1:5 -- 5 had highest accuracy (0.7162) -- then we tried 5, 7, 9, & 11 -- 11 had the highest
## k-Nearest Neighbors 
## 
## 240000 samples
##      8 predictor
##      2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 192000, 192001, 192000, 191999, 192000 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.7178208  0.4015090
##    7  0.7247958  0.4140414
##    9  0.7280875  0.4195463
##   11  0.7315166  0.4257158
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
# Plot performance metric (CV accuracy) vs. k values
plot(knn_train)

# Best k found by the CV tuning above (a 1x1 data frame)
best_neighbors <- knn_train$bestTune
## Loop through batches
# NOTE(review): knn() is instance-based -- each iteration classifies the FULL
# test subset using only the current 2.5k-row batch as the reference set, and
# knn_model is overwritten every time. After the loop only the last batch's
# predictions survive; earlier batches have no effect on the result.
for (i in 1:knn_num_batches) {
  # Get indices for the current batch
  start_index <- (i - 1) * knn_batch_size + 1
  end_index <- min(i * knn_batch_size, nrow(knn_train_data))
  
  # Read the current batch of training data
  knn_batch_data <- knn_train_data[start_index:end_index, ]
  
  # Classify the test subset against the current batch
  knn_model <- knn(train = knn_batch_data[, predictors], 
                   test = knn_test_data[, predictors], 
                   cl = knn_batch_data$reordered, 
                   k = best_neighbors[1,1])
}

table(knn_model, knn_test_data$reordered) # KNN with K=11 confusion matrix
##          
## knn_model     0     1
##         0 13271  6147
##         1 11388 29194
mean(knn_model==knn_test_data$reordered, na.rm = T) # Prediction accuracy of ~70.78% (see printed value)
## [1] 0.70775
knn_test_error <- 1-mean(knn_model==knn_test_data$reordered, na.rm = T)

# AUC Curve
# NOTE(review): knn() returns hard labels only, so this ROC has a single
# operating point; a probability-based AUC would likely be higher
roc_curve_knn <- roc(knn_test_data$reordered, as.numeric(knn_model))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_knn, main = "ROC Curve", col = "blue")

auc_knn <- auc(roc_curve_knn)
print(paste("AUC:", auc_knn))
## [1] "AUC: 0.682123414096897"

Classification Tree

library(tree)
### Classification Tree ###
###########################
# NOTE(review): as with the other "mini-batch" loops, tree() is refit from
# scratch on each iteration and overwritten -- only the final batch determines
# insta_tree.
set.seed(99)
for (i in 1:num_batches) {
  # Calculate start and end indices for the current batch
  start_index <- ((i - 1) * batch_size) + 1
  end_index <- min(i * batch_size, nrow(train_data))
  
  # Read the current batch of training data
  batch_data <- train_data[start_index:end_index, ]

  insta_tree <-  tree(as.factor(reordered) ~ order_number + order_dow + order_hour_of_day + 
                        days_since_prior_order + add_to_cart_order + avg_hour + avg_day + 
                        product_count_day + product_count_hour + order_size + avg_order_size + 
                        total_reorders + product_reorder_ratio + reorder_rate + product_count, 
                      data = batch_data)
}

# Predict hard class labels on the test split using the (last-batch) tree
tree.pred <- predict(insta_tree, newdata = test_data, type = "class")
# Confusion matrix: predicted (rows) vs actual (columns)
table(tree.pred, test_data$reordered) # correctly predicts 87.74% of the time. Misclas error rate of 12.26%.
##          
## tree.pred      0      1
##         0 488289  27711
##         1 168388 915612
mean(tree.pred==test_data$reordered)
## [1] 0.8774381
# Cross-Validation on our Classification Tree
cv.insta <- cv.tree(insta_tree, FUN = prune.misclass) # using cv to determine optimal level of tree complexity
cv.insta$size # tree sizes
## [1] 6 3 2 1
cv.insta$dev # tree size of 3 or 6 terminal nodes minimizes CV classification error (both at dev = 6077)
## [1]  6077  6077  8233 20401
cv.insta$k # alpha value of 0 -- no complexity penalty (unpruned tree minimizes our CV classification error)
## [1]  -Inf     0  2156 12168
# Plotting CV results side by side
par(mfrow = c(1, 2))
plot(cv.insta$size , cv.insta$dev, type = "b") # Plot CV Classification error against tree size
min_dev = which.min(cv.insta$dev); min_dev # CV Classification error minimized at the first value (cv.insta$dev[1])
## [1] 1
points(cv.insta$size[min_dev], cv.insta$dev[min_dev], col = "red", cex = 2, pch=20)

plot(cv.insta$k, cv.insta$dev, type = "b") # Plot CV Classification error against alpha values
points(0, cv.insta$dev[min_dev], col = "red", cex = 2, pch = 20)

# Prune to the smallest size (6) that attains the minimum CV error
prune.insta <- prune.misclass(insta_tree , best = 6)
plot(prune.insta); text(prune.insta , pretty = 0, cex = 0.5) # tree with 6 terminal nodes. We see the following vars: product_reorder_ratio & order_number

tree.pred2 <- predict(prune.insta, test_data, # predicting values in our test split using the pruned tree
                     type = "class")
table(tree.pred2, test_data$reordered) # same table as before -- accuracy of 87.74% (pruning to 6 nodes leaves predictions unchanged)
##           
## tree.pred2      0      1
##          0 488289  27711
##          1 168388 915612
mean(tree.pred2==test_data$reordered)
## [1] 0.8774381
tree_test_error <- 1-mean(tree.pred2==test_data$reordered)

# AUC Curve
# NOTE(review): computed from hard class labels, so the ROC has a single
# operating point; a probability-based AUC (type = "vector") would differ
roc_curve_tree <- roc(test_data$reordered, as.numeric(tree.pred2))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_tree, main = "ROC Curve", col = "blue")

auc_tree <- auc(roc_curve_tree)
print(paste("AUC:", auc_tree))

Random Forest

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
### Random Forests ###
######################
set.seed(38)
# Tried with a smaller batch size to save time -- took 20 mins at 1/5 of the size -- over an hour for full size
rf_batch_size <- 10000
# NOTE(review): the loop bound is still 1:num_batches (computed earlier from
# the 50k batch_size) while the indices step by rf_batch_size = 10k, so only
# the first num_batches * 10k rows of train_data are ever visited. And as with
# the other loops, insta_rf is overwritten each iteration -- only the last
# 10k-row batch trains the final forest.

for (i in 1:num_batches) {
  # Calculate start and end indices for the current batch
  start_index <- ((i - 1) * rf_batch_size) + 1
  end_index <- min(i * rf_batch_size, nrow(train_data))
  
  # Read the current batch of training data
  batch_data <- train_data[start_index:end_index, ]

  insta_rf <-  randomForest(as.factor(reordered) ~ order_number + order_dow + order_hour_of_day + 
                              days_since_prior_order + add_to_cart_order + avg_hour + avg_day + 
                              product_count_day + product_count_hour +order_size + avg_order_size + 
                              total_reorders + product_reorder_ratio + reorder_rate + product_count, 
                            data = batch_data)
}

# Variable importance (mean decrease in Gini); per the printed output,
# product_reorder_ratio and product_count dominate
importance(insta_rf)
##                        MeanDecreaseGini
## order_number                  598.29762
## order_dow                      72.31885
## order_hour_of_day             100.60654
## days_since_prior_order        307.51266
## add_to_cart_order             102.96149
## avg_hour                      150.68469
## avg_day                       159.95777
## product_count_day             157.84163
## product_count_hour            154.91041
## order_size                    118.35267
## avg_order_size                158.89244
## total_reorders                169.56384
## product_reorder_ratio        1245.54101
## reorder_rate                   98.23815
## product_count                1232.94912
varImpPlot(insta_rf)

# Hard class predictions on the test split
rf_preds <- predict(insta_rf, newdata = test_data, 
                    type = "class")
table(rf_preds, test_data$reordered)
##         
## rf_preds      0      1
##        0 502670  35540
##        1 154007 907783
# Misclassification Error Rate (test error) of ~11.85%. Correctly predicts ~88.15% of the values (see printed mean).
mean(rf_preds==test_data$reordered)
## [1] 0.8815331
rf_test_error <- 1-mean(rf_preds==test_data$reordered)

# AUC Curve
# NOTE(review): computed from hard class labels; using
# predict(insta_rf, ..., type = "prob")[, 2] would give a full ROC curve
roc_curve_rf <- roc(test_data$reordered, as.numeric(rf_preds))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_rf, main = "ROC Curve", col = "blue")

auc_rf <- auc(roc_curve_rf)
print(paste("AUC:", auc_rf))

Boosting

Because our memory is so full at this point, we can only train on a subset of n = 50,000. If the memory clears up, I can try running this model again to see what results I can get at a much larger sample size, with more hyperparameter tuning.

### Boosting ###
################
# xgboost needs a numeric matrix and is memory-hungry here, so work on a small
# 62,500-row subset (50k train / 12.5k test after the 80/20 split below)
set.seed(88)
# Create subset & indexes; seq_len() is the safe index idiom
xgboost_subset_data <- train_data[sample(nrow(train_data), 62500), ]

xgboost_train_index <- sample(seq_len(nrow(xgboost_subset_data)), 0.8 * nrow(xgboost_subset_data))
xgboost_train_data <- xgboost_subset_data[xgboost_train_index, ]
xgboost_test_data <- xgboost_subset_data[-xgboost_train_index, ]

# Train data subset: keep the 15 model features plus the response; reordered
# becomes a factor (column 16) so caret treats this as classification.
# dplyr::select is fully qualified because MASS (loaded earlier) masks select()
xgboost_train_data <- xgboost_train_data %>% 
  mutate(reordered = as.factor(reordered)) %>%
  dplyr::select(order_number, order_dow, order_hour_of_day, days_since_prior_order,
         add_to_cart_order, avg_hour, avg_day, product_count, product_count_day, 
         product_count_hour, order_size, avg_order_size, total_reorders, 
         product_reorder_ratio, reorder_rate, reordered) %>% 
  glimpse
## Rows: 50,000
## Columns: 16
## $ order_number           <int> 2, 7, 3, 1, 27, 73, 18, 8, 2, 9, 6, 32, 52, 33,…
## $ order_dow              <int> 4, 2, 1, 2, 2, 6, 4, 2, 3, 6, 1, 2, 1, 5, 5, 7,…
## $ order_hour_of_day      <int> 9, 17, 15, 13, 18, 13, 22, 9, 6, 15, 7, 8, 13, …
## $ days_since_prior_order <dbl> 5, 25, 0, 0, 5, 2, 8, 30, 14, 7, 8, 22, 9, 14, …
## $ add_to_cart_order      <int> 1, 13, 1, 10, 8, 4, 2, 9, 4, 7, 11, 21, 18, 15,…
## $ avg_hour               <dbl> 11.36000, 15.95575, 15.77778, 13.88732, 14.2329…
## $ avg_day                <dbl> 5.720000, 4.371681, 4.203704, 3.690141, 3.75000…
## $ product_count          <int> 1, 1, 2, 1, 11, 1, 8, 1, 3, 6, 1, 35, 1, 20, 2,…
## $ product_count_day      <int> 3425, 2007, 110, 1086, 51, 753, 109, 481, 667, …
## $ product_count_hour     <int> 2315, 752, 58, 566, 20, 418, 11, 213, 41, 839, …
## $ order_size             <int> 8, 14, 8, 27, 12, 14, 16, 18, 15, 13, 13, 23, 2…
## $ avg_order_size         <dbl> 8.360000, 11.548673, 8.740741, 19.760563, 11.08…
## $ total_reorders         <int> 22713, 7995, 382, 2581, 158, 2793, 318, 1086, 2…
## $ product_reorder_ratio  <dbl> 0.0000000, 0.0000000, 0.5000000, 0.0000000, 0.9…
## $ reorder_rate           <dbl> 0.6775329, 0.5247756, 0.6775329, 0.5510178, 0.5…
## $ reordered              <fct> 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,…
# Convert into numeric matrix to put into boosting function
# (drop column 16, the factor response; the apply(..., as.numeric) pass forces
# every column to numeric in case as.matrix produced a non-numeric matrix)
xgboost_train_data_x <- as.matrix(xgboost_train_data[, -16])
xgboost_train_data_x <- apply(xgboost_train_data_x, 2, as.numeric)

# Test data subset: same feature selection as the training subset, with the
# factor response in column 16
xgboost_test_data <- xgboost_test_data %>% 
  mutate(reordered = as.factor(reordered)) %>% 
  dplyr::select(order_number, order_dow, order_hour_of_day, days_since_prior_order, 
         add_to_cart_order, avg_hour, avg_day, product_count, product_count_day, 
         product_count_hour, order_size, avg_order_size, total_reorders, 
         product_reorder_ratio, reorder_rate, reordered) %>% 
  glimpse
## Rows: 12,500
## Columns: 16
## $ order_number           <int> 22, 26, 12, 27, 3, 2, 14, 12, 74, 16, 23, 22, 1…
## $ order_dow              <int> 4, 2, 7, 1, 4, 4, 2, 2, 6, 4, 3, 1, 2, 1, 4, 4,…
## $ order_hour_of_day      <int> 17, 12, 16, 10, 12, 20, 14, 9, 16, 12, 14, 17, …
## $ days_since_prior_order <dbl> 5, 7, 7, 8, 30, 30, 7, 7, 2, 6, 4, 7, 10, 11, 1…
## $ add_to_cart_order      <int> 1, 4, 2, 5, 3, 21, 6, 13, 4, 13, 11, 24, 5, 7, …
## $ avg_hour               <dbl> 9.205263, 13.799686, 11.829787, 13.146269, 13.7…
## $ avg_day                <dbl> 5.473684, 3.102121, 5.179331, 3.040299, 3.78082…
## $ product_count          <int> 34, 1, 10, 30, 2, 1, 2, 3, 1, 13, 9, 3, 11, 8, …
## $ product_count_day      <int> 2858, 398, 1095, 8211, 350, 23, 1159, 859, 3283…
## $ product_count_hour     <int> 1613, 237, 620, 4563, 240, 4, 622, 403, 2170, 4…
## $ order_size             <int> 5, 6, 21, 24, 7, 33, 16, 22, 6, 30, 13, 30, 5, …
## $ avg_order_size         <dbl> 5.031579, 21.928515, 25.054711, 20.650746, 8.31…
## $ total_reorders         <int> 15925, 613, 5638, 38544, 1457, 51, 3656, 3544, …
## $ product_reorder_ratio  <dbl> 0.9705882, 0.0000000, 0.9000000, 0.9666667, 0.5…
## $ reorder_rate           <dbl> 0.6775329, 0.6369578, 0.6762507, 0.6173831, 0.6…
## $ reordered              <fct> 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,…
# Same coercion for the held-out rows: numeric feature matrix without the
# response column, used for the final test-error calculation at the end.
xgboost_test_data_x <- xgboost_test_data[, -16] %>%
  as.matrix() %>%
  apply(2, as.numeric)

library(caret)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Tune parameters
# Hyperparameter grid for caret: 3 x 3 x 2 = 18 combinations of tree count,
# depth and learning rate; the remaining xgboost knobs are held fixed at
# their defaults so the search stays tractable.
grid_tune <- expand.grid(
  nrounds = c(500, 1000, 1500), # number of trees
  max_depth = c(2, 4, 6),
  eta = c(0.1, 0.3), # could try c(0.025, 0.05, 0.1, 0.3) # learning rate
  gamma = 0, 
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)

# Cross-validation
# 5-fold CV; verboseIter prints the fold-by-fold progress recorded below,
# and allowParallel lets caret use a registered parallel backend if present.
train_control <- trainControl(method = "cv",
                              number = 5,
                              verboseIter = TRUE,
                              allowParallel = TRUE)

# One xgboost fit per grid row per fold. y is column 16 of the training
# subset (`reordered`, a factor), so caret runs binary classification.
xgb_tune <- train(x = xgboost_train_data_x,
                  y = xgboost_train_data[, 16],
                  trControl = train_control,
                  tuneGrid = grid_tune,
                  method = "xgbTree",
                  verbose = TRUE)
## + Fold1: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [07:40:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:40:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold1: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [07:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:44:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold1: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [07:50:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:50:30] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold1: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [07:52:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:52:17] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold1: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [07:55:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [07:55:26] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold1: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:00:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:00:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold1: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold2: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:02:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:02:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold2: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:06:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:06:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold2: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:10:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:10:36] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold2: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:12:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:12:07] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold2: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:15:12] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:15:13] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold2: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:25:39] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:25:40] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold2: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold3: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:28:02] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:28:03] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold3: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:31:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:31:15] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold3: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:36:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:36:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold3: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:37:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:37:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold3: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [08:40:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [08:40:51] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold3: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:03:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:03:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold3: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold4: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:04:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:04:41] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold4: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:07:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:07:33] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold4: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:11:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:11:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold4: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:14:49] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:14:50] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold4: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:18:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:18:24] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold4: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:20:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:20:21] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold4: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold5: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:20:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:20:58] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.1, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold5: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:22:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:22:10] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.1, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold5: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:23:54] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:23:55] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.1, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold5: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:24:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:24:29] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.3, max_depth=2, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold5: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:25:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:25:35] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.3, max_depth=4, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## + Fold5: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## [09:27:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## [09:27:18] WARNING: src/c_api/c_api.cc:935: `ntree_limit` is deprecated, use `iteration_range` instead.
## - Fold5: eta=0.3, max_depth=6, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1, nrounds=1500 
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 500, max_depth = 2, eta = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 on full training set
# CV accuracy/kappa across all 18 grid rows; caret keeps the most accurate.
xgb_tune
## eXtreme Gradient Boosting 
## 
## 50000 samples
##    15 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 40000, 40001, 39999, 40000, 40000 
## Resampling results across tuning parameters:
## 
##   eta  max_depth  nrounds  Accuracy   Kappa    
##   0.1  2           500     0.8858598  0.7588373
##   0.1  2          1000     0.8849398  0.7572350
##   0.1  2          1500     0.8834598  0.7543262
##   0.1  4           500     0.8833399  0.7540533
##   0.1  4          1000     0.8803399  0.7480462
##   0.1  4          1500     0.8789999  0.7455985
##   0.1  6           500     0.8811599  0.7497944
##   0.1  6          1000     0.8778799  0.7432704
##   0.1  6          1500     0.8771799  0.7419598
##   0.3  2           500     0.8832398  0.7539887
##   0.3  2          1000     0.8810798  0.7498776
##   0.3  2          1500     0.8786198  0.7449194
##   0.3  4           500     0.8761000  0.7398405
##   0.3  4          1000     0.8726400  0.7330412
##   0.3  4          1500     0.8718599  0.7314719
##   0.3  6           500     0.8748399  0.7375824
##   0.3  6          1000     0.8743799  0.7366667
##   0.3  6          1500     0.8729199  0.7338152
## 
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## 
## Tuning parameter 'subsample' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
##  = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1 and subsample = 1.
# CV winner: nrounds = 500, max_depth = 2, eta = 0.1 (see output above).
best_params <- xgb_tune$bestTune

# Final model -- grid
# Rebuild a one-row tuning grid from the winning combination so the final
# fit trains on exactly the CV-selected hyperparameters.
final_grid <- with(best_params, expand.grid(
  nrounds = nrounds,
  max_depth = max_depth,
  eta = eta,
  gamma = gamma,
  colsample_bytree = colsample_bytree,
  min_child_weight = min_child_weight,
  subsample = subsample
))

# Refit with the fixed one-row grid; the 5-fold resampling here only
# re-estimates accuracy for the already-chosen hyperparameters before
# caret fits the final model on the full training subset.
xgb_final <- train(x = xgboost_train_data_x,
                  y = xgboost_train_data[, 16],
                  trControl = train_control,
                  tuneGrid = final_grid,
                  method = "xgbTree",
                  verbose = TRUE)
## + Fold1: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## - Fold1: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold2: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## - Fold2: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold3: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## - Fold3: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold4: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## - Fold4: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## + Fold5: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## - Fold5: nrounds=500, max_depth=2, eta=0.1, gamma=0, colsample_bytree=1, min_child_weight=1, subsample=1 
## Aggregating results
## Fitting final model on full training set
# Resampled accuracy of the final model (~0.886 per the output below).
xgb_final
## eXtreme Gradient Boosting 
## 
## 50000 samples
##    15 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 40001, 40000, 40000, 40000, 39999 
## Resampling results:
## 
##   Accuracy  Kappa   
##   0.88618   0.759444
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
##  held constant at a value of 1
## Tuning parameter 'subsample' was held
##  constant at a value of 1
# Misclassification error rate of 11.56%.
# Hard class predictions for the confusion matrix and error rate.
boost_preds <- predict(xgb_final, newdata = xgboost_test_data_x, type = "raw")
table(boost_preds, xgboost_test_data$reordered)
##            
## boost_preds    0    1
##           0 3923  317
##           1 1128 7132
mean(boost_preds==xgboost_test_data$reordered) # Correctly predicts 88.44% of the time.
## [1] 0.8844
boost_test_error <- 1-mean(boost_preds==xgboost_test_data$reordered)

# Evaluate model
library(pROC)
# FIX: build the ROC curve from the predicted class-1 probabilities instead
# of the hard 0/1 class labels. With hard labels the "curve" has a single
# operating point, so the recorded AUC below understated the model.
# Column 2 of the probability frame corresponds to the second factor level
# ("1" = reordered). Recorded output predates this fix and will change on
# re-run.
boost_probs <- predict(xgb_final, newdata = xgboost_test_data_x, type = "prob")[, 2]
roc_curve_boost <- roc(xgboost_test_data$reordered, boost_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_boost, main = "ROC Curve", col = "blue")

auc_boost <- auc(roc_curve_boost)
print(paste("AUC:", auc_boost)) # AUC: 86.80 (from hard labels; higher expected now)
## [1] "AUC: 0.867060918887782"
## Plots of best tuning hyperparameters
tuning_results <- data.frame(xgb_tune$results)


# heatmap
# FIX: the grid holds 3 nrounds values per (max_depth, eta) cell, so a
# single un-faceted geom_tile() silently overplots tiles and only the
# last-drawn accuracy is visible. Facet by nrounds so every grid row shows,
# and map Accuracy straight to fill (geom_tile has no z aesthetic).
ggplot(tuning_results, aes(x = max_depth, y = eta, fill = Accuracy)) +
  geom_tile() +
  facet_wrap(~ nrounds) +
  scale_fill_viridis_c() +
  labs(title = "XGBoost Tuning - Accuracy Heatmap",
       x = "Max Depth",
       y = "Learning Rate (eta)") +
  theme_Publication() +
    theme(legend.key.width = unit(1.5, "cm"))

# Line Chart
# FIX: mapping the numeric eta to colour leaves ggplot's group aesthetic at
# its default (one group), producing a single zigzag line per panel.
# as.factor(eta) both groups the points into one line per learning rate and
# gives a discrete legend.
  ggplot(tuning_results, aes(x = nrounds, y = Accuracy, color = as.factor(eta))) +
  geom_line() +
  facet_wrap(~ max_depth) +  # Facet by max_depth
  labs(title = "XGBoost Tuning - Accuracy by Parameters",
       x = "Number of Trees (nrounds)",
       y = "Accuracy",
       color = "Learning Rate (eta)") +
  theme_Publication() +
    theme(legend.key.width = unit(1, "cm"))

SVM (Support Vector Machines)

# Reproducible subsample: SVM training scales poorly, so work with 10k rows.
set.seed(55)
svm_subset_data <- train_data[sample(nrow(train_data), 10000), ]

# 80/20 split of the subsample into train and test rows by index.
svm_train_index <- sample(seq_len(nrow(svm_subset_data)),
                          0.8 * nrow(svm_subset_data))
svm_train_data <- svm_subset_data[svm_train_index, ]
svm_test_data  <- svm_subset_data[-svm_train_index, ]

library(e1071)
# Find best hyperparameters for SVM model
# NOTE(review): `reordered` is numeric here (not a factor), so svm() runs
# eps-regression rather than C-classification -- confirmed by the
# "SVM-Type: eps-regression" line in the later summary(svmfit) output. The
# "best performance" of ~0.119 below is therefore a mean squared error, not
# a misclassification rate. Wrapping the response in as.factor() would make
# this a true classification tune; left as-is to match the recorded run.
tune.out <- tune(svm, 
                 reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
                   add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + 
                   product_count_hour + order_size + avg_order_size + total_reorders + 
                   product_reorder_ratio + reorder_rate, 
                 data = svm_train_data,
                 kernel = "radial",
                 ranges = list(
                   cost = c(1, 10),
                   gamma = c(0.5, 1)
                   )
                 )

# Tuning summary: 10-fold CV over the 2x2 cost/gamma grid.
tune.out
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.5
## 
## - best performance: 0.1191082
# Winning cost/gamma pair, reused for the final fit below.
final_hypers <- tune.out$best.parameters

# Per-combination CV performance, used for the tuning heatmap.
results <- as.data.frame(tune.out$performances)


# Plotting cost vs. gamma with performance as color
# FIX: the original call passed `labels = round(1 - results$error, 2)` to
# scale_fill_gradient() without a matching `breaks` vector, which aborts
# with the error recorded below. Mapping accuracy (1 - error) to fill
# directly gives a correct legend with no label surgery.
ggplot(results, aes(x = cost, y = gamma, fill = 1 - error)) +
  geom_tile() +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(title = "Hyperparameter Tuning Results",
       x = "Cost",
       y = "Gamma",
       fill = "Accuracy") +
  theme_Publication() +
  theme(legend.key.width = unit(1, "cm"))
## Error produced by the ORIGINAL call (kept for the record; fixed above):
## Error in `scale_fill_gradient()`:
## ! `breaks` and `labels` have different lengths.
# Support Vector Machines
# Final SVM with the tuned cost/gamma. NOTE(review): because the response
# is numeric this trains an eps-regression model (see "SVM-Type" in the
# summary below); its predictions are continuous scores that are
# thresholded at 0.5 further down to recover class labels.
svmfit <- svm(reordered ~ order_number + order_dow + order_hour_of_day + days_since_prior_order +
                add_to_cart_order + avg_hour + avg_day + product_count + product_count_day + 
                product_count_hour + order_size + avg_order_size + total_reorders + product_reorder_ratio + reorder_rate, 
                data = svm_train_data, 
                kernel = "radial", 
                gamma = final_hypers$gamma, 
                cost = final_hypers$cost)

summary(svmfit)
## 
## Call:
## svm(formula = reordered ~ order_number + order_dow + order_hour_of_day + 
##     days_since_prior_order + add_to_cart_order + avg_hour + avg_day + 
##     product_count + product_count_day + product_count_hour + order_size + 
##     avg_order_size + total_reorders + product_reorder_ratio + reorder_rate, 
##     data = svm_train_data, kernel = "radial", gamma = final_hypers$gamma, 
##     cost = final_hypers$cost)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.5 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  6461
# Continuous regression scores from the eps-regression SVM.
svm_preds <- predict(svmfit, newdata = svm_test_data)

# Threshold the scores at 0.5 to recover hard 0/1 class labels.
svm_class_preds <- ifelse(svm_preds > 0.5, 1, 0)

# (Comment corrected: (563 + 1104) / 2000 = 0.8335, i.e. 83.35% accuracy,
# not the 84.7% originally stated here.)
table(svm_class_preds, svm_test_data$reordered) # Correctly predicts 83.35% of the time. 
##                
## svm_class_preds    0    1
##               0  563   57
##               1  276 1104
mean(svm_class_preds==svm_test_data$reordered) # Misclassification error rate of 16.65%
## [1] 0.8335
svm_test_error <- 1-mean(svm_class_preds==svm_test_data$reordered)

# Plot AUC
# ROC from the continuous scores, so the curve sweeps all thresholds.
roc_curve_svm <- roc(svm_test_data$reordered, svm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve_svm, main = "ROC Curve", col = "blue")

auc_svm <- auc(roc_curve_svm)
print(paste("AUC:", auc_svm))
## [1] "AUC: 0.889341624241976"

Plot of Final Results (test errors)

# Collect every model's held-out misclassification rate, in display order.
final_test_errors <- c(logit_test_error, lda_test_error, qda_test_error,
                       knn_test_error, tree_test_error, rf_test_error,
                       boost_test_error, svm_test_error)
model_names <- c("Logit", "LDA", "QDA", "kNN", "Classification 
  Tree",
                 "Random Forest", "Boosting", "SVM")
# Plot
# One bar per model with its rounded error printed above the bar.
err_df <- data.frame(model_names, final_test_errors)
ggplot(err_df,
       aes(model_names, final_test_errors, fill = as.factor(model_names))) + 
  geom_col() +
  geom_text(aes(label = round(final_test_errors, 3)), vjust = -0.5, size = 3) +
  labs(title = "Test Error Comparison",
       x = "Model Names",
       y = "Misclassification Rate") +
  scale_colour_Publication() +
  theme_Publication() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 9))

# Combine ROC curve data into one dataset
# FIX: pROC stores specificities, and FPR = 1 - specificity. The original
# code assigned specificities directly to FPR, which mirrors every curve
# horizontally and makes the dashed chance diagonal meaningless. A small
# helper does the conversion once for all eight models.
roc_df <- function(model, curve) {
  data.frame(Model = model,
             FPR = 1 - curve$specificities,
             TPR = curve$sensitivities)
}
roc_data <- rbind(roc_df("Logit", roc_curve_logit),
                  roc_df("LDA", roc_curve_lda),
                  roc_df("QDA", roc_curve_qda),
                  roc_df("kNN", roc_curve_knn),
                  roc_df("Tree", roc_curve_tree),
                  roc_df("Random Forest", roc_curve_rf),
                  roc_df("Boosting", roc_curve_boost),
                  roc_df("SVM", roc_curve_svm))

# Plot ROC curves for all models
ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) +
  geom_line() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed") +  # chance line
  labs(title = "ROC Curves for Different Models",
       x = "False Positive Rate",
       y = "True Positive Rate",
       color = "Model") +
  theme_minimal()

# Combine AUC data into one dataset (same model order as model_names).
auc_data <- data.frame(Model = model_names, 
                       AUC = c(auc_logit, auc_lda, auc_qda, auc_knn,
                               auc_tree, auc_rf, auc_boost, auc_svm))

# Plot AUC Scores for all models: one bar per model, value printed on top.
ggplot(auc_data, aes(Model, AUC, fill = Model)) + 
  geom_col() +
  geom_text(aes(label = round(AUC, 3)), vjust = -0.5, size = 3) +
  labs(title = "AUC Comparison",
       x = "Model Names",
       y = "AUC") +
  scale_colour_Publication() +
  theme_Publication() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 9))